#   LibAut
#       onset regressions: multiple, leave-pair-out CV
#       LPO-CV for Table 1 column (model) 1, and for Table C3
#   Juraj Medzihorsky
#   2021-08-11

library(logistf)
library(parallel)
options(mc.cores=14) # written for a 16-thread linux machine
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 4.0.4 (2021-02-15)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 20.04.2 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.13.so
##
##  locale:
##   [1] LC_CTYPE=C.UTF-8           LC_NUMERIC=C
##   [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C.UTF-8
##   [7] LC_PAPER=C.UTF-8           LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C
##  [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods
##  [8] base
##
##  other attached packages:
##  [1] logistf_1.24   nvimcom_0.9-92
##
##  loaded via a namespace (and not attached):
##   [1] Rcpp_1.0.6           magrittr_2.0.1       splines_4.0.4
##   [4] tidyselect_1.1.0     lattice_0.20-41      R6_2.5.0
##   [7] rlang_0.4.10         dplyr_1.0.4          tools_4.0.4
##  [10] grid_4.0.4           broom_0.7.5          nlme_3.1-152
##  [13] mgcv_1.8-34          DBI_1.1.1            ellipsis_0.3.1
##  [16] formula.tools_1.7.1  assertthat_0.2.1     tibble_3.0.6
##  [19] lifecycle_1.0.0      crayon_1.4.1         Matrix_1.3-2
##  [22] tidyr_1.1.2          purrr_0.3.4          operator.tools_1.6.3
##  [25] vctrs_0.3.6          glue_1.4.2           mice_3.13.0
##  [28] compiler_4.0.4       pillar_1.4.7         backports_1.2.1
##  [31] generics_0.1.0       pkgconfig_2.0.3

#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)

setwd(data_dir)
dat <- readRDS('onset_20210810.rds')
eps <- read.csv('eps.csv')
setwd(code_dir)
dat <- subset(dat, onset_eligible%in%1)

#------------------------------------------------------------------------------
#   get EP shares
tab_dem <- aggregate(eps$dem_ep, by=list(eps$year), FUN=mean, na.rm=T)
tab_aut <- aggregate(eps$aut_ep, by=list(eps$year), FUN=mean, na.rm=T)
names(tab_dem) <- c('year', 'share_dem_ep')
names(tab_aut) <- c('year', 'share_aut_ep')

tab_lag <- tab <- merge(tab_dem, tab_aut, by='year')
tab_lag$year <- tab$year + 1
names(tab_lag)[2:3] <- paste0('lag1_', names(tab)[2:3])
tab_tab <- merge(tab, tab_lag)

dat <- merge(dat, tab_tab, by='year')

weird <- which(abs(dat$lag1_e_migdpgro)>1)
dat$lag1_e_migdpgro[weird] <- NA
dat$logpop <- log(dat$e_mipopula) 
dat$reg_fac <- as.factor(dat$e_regionpol_6C)

#   filter out missing
vars_to_go <- 
    c('year',
      'libaut_start', 
      'reg_fac', 
      'lag1_e_migdppcln',  
      'lag1_e_migdpgro',  
      'lag1_logpop', 
      'lag1_v2pepwrsoc',  
      'lag1_v2xeg_eqdr',  
      'lag1_v2xnp_pres',  
      'lag1_v2csprtcpt', 
      'lag1_v2x_polyarchy', 
      'lag1_excl_region_edi',  
      'lag1_e_miinteco',  
      'lag1_e_miinterc', 
      'lag1_share_dem_ep', 
      'lag1_share_aut_ep')  

na_fil <- rowSums(is.na(dat[,vars_to_go]))==0
dat <- dat[na_fil, ]

#------------------------------------------------------------------------------
#   models

#   formulas
fy0 <- libaut_start ~ 1
lin_1 <- ~ . + 
    reg_fac +
    lag1_e_migdppcln + 
    lag1_e_migdpgro + 
    lag1_logpop +
    lag1_v2pepwrsoc + 
    lag1_v2xeg_eqdr + 
    lag1_v2xnp_pres + 
    lag1_v2csprtcpt +
    lag1_v2x_polyarchy +
    lag1_excl_region_edi + 
    lag1_e_miinteco + 
    lag1_e_miinterc +
    lag1_share_dem_ep +
    lag1_share_aut_ep  
lin_2 <- ~ . + 
    I(1e-1*(year-1990)) + I((1e-1*(year-1990))^2) + I((1e-1*(year-1990))^3) 
fl1 <- update(fy0, lin_1)
fl2 <- update(fl1, lin_2)

#   gaussian-linear model under MLE for one left-out pair
onemlegau <-
    function(ii, dat, form)
    {
        ii <- unlist(ii)    
        m <- lm(form, data=dat[-ii,])
        p <- predict(m, newdata=dat[ii,], type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==dat[ii,'libaut_start'])
        return(data.frame(auc=auc, cc=cc))
    }


#   bernoulli-logistic model under MLE for one left-out pair
onemlelgt <-
    function(ii, dat, form)
    {
        ii <- unlist(ii)    
        m <- glm(form, data=dat[-ii,], family=binomial('logit'))
        p <- predict(m, newdata=dat[ii,], type='response')
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==dat[ii,'libaut_start'])
        return(data.frame(auc=auc, cc=cc))
    }


#   bernoulli-logistic model under FPL for one left-out pair
onelpo <-
    function(ii, dat, nd, form, 
             mcont=logistf.control(maxit=1e4, maxstep=1e4))
    {
        ii <- unlist(ii)
        m <- logistf(form, data=dat[-ii,], control=mcont)
        lp <- nd[ii,] %*% matrix(coef(m), ncol=1)
        p <- plogis(lp)
        auc <- as.numeric(p[1]<p[2])
        cc <- mean(round(p)==dat[ii,'libaut_start'])
        return(data.frame(auc=auc, cc=cc))
    }

#------------------------------------------------------------------------------

#   leave-pair-out grid: all 0-1 paits
gp <- expand.grid(y0=which(dat$libaut_start%in%0),
                  y1=which(dat$libaut_start%in%1))

#   prep model matrices for FPL models
mdat1 <- model.matrix(fl1, dat)
mdat2 <- model.matrix(fl2, dat)

#------------------------------------------------------------------------------
#   execute

rr <- 1:nrow(gp)
set.seed(123)
rr <- sample(1:nrow(gp), 1e4, replace=F)

system.time(mle_gau_1 <- do.call(rbind, mclapply(rr, function(j) onemlegau(gp[j,], dat=dat, form=fl1))))
system.time(mle_gau_2 <- do.call(rbind, mclapply(rr, function(j) onemlegau(gp[j,], dat=dat, form=fl2))))
system.time(mle_lgt_1 <- do.call(rbind, mclapply(rr, function(j) onemlelgt(gp[j,], dat=dat, form=fl1))))
system.time(mle_lgt_2 <- do.call(rbind, mclapply(rr, function(j) onemlelgt(gp[j,], dat=dat, form=fl2))))
system.time(fpl_lgt_1 <- do.call(rbind, mclapply(rr, function(j) onelpo(gp[j,], dat=dat, nd=mdat1, form=fl1))))
system.time(fpl_lgt_2 <- do.call(rbind, mclapply(rr, function(j) onelpo(gp[j,], dat=dat, nd=mdat2, form=fl2))))

#------------------------------------------------------------------------------
#   save

setwd(data_dir)
save(list=c('mle_gau_1', 'mle_gau_2',
            'mle_lgt_1', 'mle_lgt_2', 
            'fpl_lgt_1', 'fpl_lgt_2'), 
     file='onset_regressions_multiple_lpo_20210811.rdata')

#   for Table C.3
#   round(rbind(
#         cbind(mean(mle_gau_1$auc), mean(mle_gau_2$auc),
#               mean(mle_lgt_1$auc), mean(mle_lgt_2$auc),
#               mean(fpl_lgt_1$auc), mean(fpl_lgt_2$auc)),
#         cbind(mean(mle_gau_1$cc), mean(mle_gau_2$cc),
#               mean(mle_lgt_1$cc), mean(mle_lgt_2$cc),
#               mean(fpl_lgt_1$cc), mean(fpl_lgt_2$cc))),
#         2)
#   #   the first row, last column is in Table 1, column (model) 1 
#   #   the first row is the bottom row of Table C.3
#   #   0.76 0.78 0.76 0.78 0.76 0.78
#   #   0.50 0.50 0.51 0.51 0.51 0.51

#   SCRIPT END